In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
pd.set_option('display.max_columns', None)
data = pd.read_excel('nba_player_data.xlsx')
In [2]:
data.sample(10)
Out[2]:
| Unnamed: 0 | Year | Season_type | PLAYER_ID | RANK | PLAYER | TEAM_ID | TEAM | GP | MIN | FGM | FGA | FG_PCT | FG3M | FG3A | FG3_PCT | FTM | FTA | FT_PCT | OREB | DREB | REB | AST | STL | BLK | TOV | PF | PTS | EFF | AST_TOV | STL_TOV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2407 | 260 | 2020-21 | Regular%20Season | 1630181 | 260 | R.J. Hampton | 1610612753 | ORL | 51 | 888 | 136 | 313 | 0.435 | 27 | 87 | 0.310 | 55 | 82 | 0.671 | 29 | 150 | 179 | 87 | 21 | 10 | 53 | 61 | 354 | 394 | 1.64 | 0.40 |
| 1168 | 60 | 2022-23 | Playoffs | 1626157 | 61 | Karl-Anthony Towns | 1610612750 | MIN | 5 | 180 | 32 | 70 | 0.457 | 6 | 24 | 0.250 | 21 | 28 | 0.750 | 6 | 45 | 51 | 10 | 3 | 4 | 18 | 21 | 91 | 96 | 0.56 | 0.17 |
| 3328 | 402 | 2019-20 | Regular%20Season | 201587 | 403 | Nicolas Batum | 1610612766 | CHA | 22 | 505 | 28 | 81 | 0.346 | 14 | 49 | 0.286 | 9 | 10 | 0.900 | 25 | 75 | 100 | 66 | 17 | 8 | 22 | 41 | 79 | 194 | 3.00 | 0.77 |
| 3390 | 464 | 2019-20 | Regular%20Season | 1629718 | 464 | Charlie Brown Jr. | 1610612737 | ATL | 10 | 40 | 6 | 19 | 0.316 | 3 | 9 | 0.333 | 5 | 5 | 1.000 | 2 | 2 | 4 | 2 | 2 | 2 | 3 | 3 | 20 | 14 | 0.67 | 0.67 |
| 3589 | 134 | 2019-20 | Playoffs | 1627884 | 135 | Derrick Jones Jr. | 1610612748 | MIA | 15 | 97 | 8 | 17 | 0.471 | 4 | 9 | 0.444 | 2 | 5 | 0.400 | 6 | 6 | 12 | 7 | 6 | 5 | 2 | 12 | 22 | 38 | 3.50 | 3.00 |
| 3623 | 168 | 2019-20 | Playoffs | 1629598 | 169 | Chris Clemons | 1610612745 | HOU | 2 | 8 | 2 | 5 | 0.400 | 2 | 5 | 0.400 | 1 | 2 | 0.500 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 7 | 5 | 0.00 | 0.00 |
| 655 | 86 | 2022-23 | Regular%20Season | 1630180 | 87 | Saddiq Bey | 1610612737 | ATL | 77 | 2129 | 353 | 836 | 0.422 | 151 | 418 | 0.361 | 205 | 238 | 0.861 | 98 | 264 | 362 | 118 | 70 | 13 | 71 | 123 | 1062 | 1038 | 1.66 | 0.99 |
| 465 | 465 | 2023-24 | Regular%20Season | 1641720 | 465 | Jalen Hood-Schifino | 1610612747 | LAL | 21 | 109 | 10 | 45 | 0.222 | 2 | 15 | 0.133 | 12 | 20 | 0.600 | 2 | 11 | 13 | 8 | 3 | 2 | 9 | 13 | 34 | 8 | 0.89 | 0.33 |
| 2078 | 148 | 2021-22 | Playoffs | 1627854 | 148 | Bryn Forbes | 1610612743 | DEN | 5 | 76 | 6 | 15 | 0.400 | 4 | 11 | 0.364 | 4 | 5 | 0.800 | 1 | 2 | 3 | 7 | 1 | 0 | 2 | 7 | 20 | 19 | 3.50 | 0.50 |
| 3360 | 434 | 2019-20 | Regular%20Season | 1629662 | 435 | Mfiondu Kabengele | 1610612746 | LAC | 12 | 64 | 14 | 32 | 0.438 | 9 | 20 | 0.450 | 5 | 5 | 1.000 | 1 | 10 | 11 | 2 | 2 | 2 | 2 | 9 | 42 | 39 | 1.00 | 1.00 |
In [3]:
data.shape
Out[3]:
(3672, 31)
In [4]:
# Data cleaning & analysis preparation
In [5]:
data.isna().sum()
Out[5]:
Unnamed: 0 0 Year 0 Season_type 0 PLAYER_ID 0 RANK 0 PLAYER 0 TEAM_ID 0 TEAM 0 GP 0 MIN 0 FGM 0 FGA 0 FG_PCT 0 FG3M 0 FG3A 0 FG3_PCT 0 FTM 0 FTA 0 FT_PCT 0 OREB 0 DREB 0 REB 0 AST 0 STL 0 BLK 0 TOV 0 PF 0 PTS 0 EFF 0 AST_TOV 0 STL_TOV 0 dtype: int64
In [6]:
data.drop(columns=['RANK','EFF'], inplace=True)
In [7]:
data['season_start_year'] = data['Year'].astype(str).str[:4].astype(int)
In [8]:
data.TEAM.nunique()
Out[8]:
30
In [9]:
# Assuming 'data' is your DataFrame
# Extract the numerical part of the 'Year' column
data['Year'] = data['Year'].str.extract('(\d+)')
# Convert the extracted numerical part to float
data['Year'] = data['Year'].astype(float)
In [10]:
data['Season_type'].replace('Regular%20Season','RS', inplace=True)
In [11]:
rs_df = data[data['Season_type']=='RS']
playoffs_df = data[data['Season_type']=='Playoffs']
In [12]:
data.columns
Out[12]:
Index(['Unnamed: 0', 'Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID',
'TEAM', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
'TOV', 'PF', 'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
dtype='object')
In [13]:
total_cols = ['MIN', 'FGM', 'FGA', 'FG3M', 'FG3A','FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
'TOV', 'PF', 'PTS']
In [14]:
# Which player stats are correlated with each other
In [ ]:
In [15]:
data_per_min = data.groupby(['PLAYER','PLAYER_ID','Year'])[total_cols].sum().reset_index()
for col in data_per_min.columns [4:]:
data_per_min[col] = data_per_min[col]/data_per_min['MIN']
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']
data_per_min = data_per_min[data_per_min['MIN']>=100]
data_per_min.drop(columns='PLAYER_ID', inplace=True)
data_per_min.drop(columns='PLAYER', inplace=True)
data_per_min.drop(columns='Year', inplace=True)
fig = px.imshow(data_per_min.corr())
fig.show()
In [16]:
(data_per_min['MIN']>=100).mean()
Out[16]:
1.0
In [17]:
#How are minutes played distributed
In [18]:
fig = px.histogram(x=rs_df['MIN'], histnorm='percent')
fig.show()
In [19]:
# Minutes Comparison
def hist_data(df=rs_df, min_MIN=0, min_GP=0):
return df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'MIN']/\
df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'GP']
In [20]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=hist_data(rs_df,50,5), histnorm='percent', name='RS',
xbins={'start':0,'end':48,'size':1}))
fig.add_trace(go.Histogram(x=hist_data(playoffs_df,5,1), histnorm='percent',
name='Playoffs', xbins={'start':0,'end':48,'size':1}))
fig.update_traces(opacity=0.5)
fig.show()
In [21]:
((hist_data(playoffs_df,5,1)>=12)&(hist_data(playoffs_df,5,1)<=34)).mean()
Out[21]:
0.46973365617433416
In [22]:
# Points Comparison
def hist_data(df=rs_df, min_MIN=0, min_GP=0):
return df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'PTS']/\
df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'GP']
In [23]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=hist_data(rs_df,50,5), histnorm='percent', name='RS',
xbins={'start':0,'end':48,'size':1}))
fig.add_trace(go.Histogram(x=hist_data(playoffs_df,5,1), histnorm='percent',
name='Playoffs', xbins={'start':0,'end':48,'size':1}))
fig.update_traces(opacity=0.5)
fig.show()
In [24]:
# How has the game changed over the past 5 years?
In [25]:
change_df = data.groupby('season_start_year')[total_cols].sum().reset_index()
change_df['POSS_est'] = change_df['FGA']-change_df['OREB']+change_df['TOV']+0.44*change_df['FTA']
change_df = change_df[list(change_df.columns[0:2])+['POSS_est']+list(change_df.columns[2:-1])]
change_df['FG%'] = change_df['FGM']/change_df['FGA']
change_df['3PT%'] = change_df['FG3M']/change_df['FG3A']
change_df['FT%'] = change_df['FTM']/change_df['FTA']
change_df['FG3A%'] = change_df['FG3A']/change_df['FGA']
change_df['PTS/FGA'] = change_df['PTS']/change_df['FGA']
change_df['FG3M/FGM'] = change_df['FG3M']/change_df['FGM']
change_df['FTA/FGA'] = change_df['FTA']/change_df['FGA']
change_df['TRU%'] = 0.5*change_df['PTS']/(change_df['FGA']+0.475*change_df['FTA'])
change_df['AST_TOV'] = change_df['AST']/change_df['TOV']
change_df
Out[25]:
| season_start_year | MIN | POSS_est | FGM | FGA | FG3M | FG3A | FTM | FTA | OREB | DREB | REB | AST | STL | BLK | TOV | PF | PTS | FG% | 3PT% | FT% | FG3A% | PTS/FGA | FG3M/FGM | FTA/FGA | TRU% | AST_TOV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019 | 552262 | 234384.64 | 92997 | 202223 | 28032 | 78279 | 40949 | 52906 | 22802 | 79318 | 102120 | 55445 | 17368 | 11085 | 31685 | 47615 | 254975 | 0.459874 | 0.358104 | 0.773995 | 0.387092 | 1.260861 | 0.301429 | 0.261622 | 0.560746 | 1.749882 |
| 1 | 2020 | 562518 | 235759.48 | 95849 | 205754 | 29549 | 80653 | 39624 | 50917 | 22918 | 80151 | 103069 | 57311 | 17491 | 11272 | 30520 | 45152 | 260871 | 0.465843 | 0.366372 | 0.778208 | 0.391988 | 1.267878 | 0.308287 | 0.247465 | 0.567260 | 1.877818 |
| 2 | 2021 | 635572 | 264004.96 | 106569 | 231293 | 32733 | 92552 | 44740 | 57709 | 27052 | 89602 | 116654 | 64618 | 20006 | 12387 | 34372 | 52038 | 290611 | 0.460753 | 0.353671 | 0.775269 | 0.400150 | 1.256463 | 0.307153 | 0.249506 | 0.561665 | 1.879960 |
| 3 | 2022 | 635386 | 266600.04 | 110010 | 231870 | 32382 | 89926 | 48136 | 61516 | 27403 | 86695 | 114098 | 66265 | 19078 | 12250 | 35066 | 52438 | 300538 | 0.474447 | 0.360096 | 0.782496 | 0.387829 | 1.296149 | 0.294355 | 0.265304 | 0.575545 | 1.889722 |
| 4 | 2023 | 566250 | 236554.12 | 98921 | 208504 | 30096 | 82256 | 40161 | 51223 | 24729 | 77368 | 102097 | 62597 | 17468 | 12070 | 30241 | 44178 | 268099 | 0.474432 | 0.365882 | 0.784042 | 0.394506 | 1.285822 | 0.304243 | 0.245669 | 0.575728 | 2.069938 |
In [26]:
change_per48_df = change_df.copy()
In [27]:
change_per48_df.columns[2:18]
Out[27]:
Index(['POSS_est', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
dtype='object')
In [28]:
change_per48_df = change_df.copy()
for col in change_per48_df.columns [2:18]:
change_per48_df[col] = (change_per48_df[col]/change_per48_df['MIN'])*48*5
change_per48_df.drop(columns='MIN', inplace=True)
fig = go.Figure()
for col in change_per48_df.columns[1:]:
fig.add_trace(go.Scatter(x=change_per48_df['season_start_year'],
y=change_per48_df[col], name=col))
fig.show()
In [29]:
print(change_per48_df.columns)
Index(['season_start_year', 'POSS_est', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM',
'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
'FG%', '3PT%', 'FT%', 'FG3A%', 'PTS/FGA', 'FG3M/FGM', 'FTA/FGA', 'TRU%',
'AST_TOV'],
dtype='object')
In [30]:
change_per100_df = change_df.copy()
for col in change_per100_df.columns [3:18]:
change_per100_df[col] = (change_per100_df[col]/change_per100_df['POSS_est'])*100
change_per100_df.drop(columns=['MIN','POSS_est'], inplace=True)
fig = go.Figure()
for col in change_per100_df.columns[1:]:
fig.add_trace(go.Scatter(x=change_per100_df['season_start_year'],
y=change_per100_df[col], name=col))
fig.show()
In [ ]: